import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import iplot
from plotly.graph_objs import Scatter
import seaborn as sns
import squarify
import math
import numpy as np
from scipy import stats as st
import numpy as np
pd.options.display.max_columns = None
df_games = pd.read_csv('games.csv')
df_games.columns = df_games.columns.str.lower()
df_games.head()
df_games.describe()
df_games.describe(include = ['object'])
df_games.info()
df_games.shape
#how many NULL are ther in df_games
df_games.isnull().sum()
df_games[df_games['name'].isna()]
df_games['name'] = df_games['name'].fillna('missing name')
df_games['genre'] = df_games['genre'].fillna('missing genre')
df_games[df_games['year_of_release'].isna()].head()
def year_of_release(row):
name = row['name']
for year in range(1980, 2017):
if str(year) in name:
return year
return None
df_games['year_of_release'] = df_games['year_of_release'].fillna(df_games.apply(year_of_release, axis = 1))
platform_year_of_release_df = df_games.pivot_table(index = 'platform', values = 'year_of_release', aggfunc = 'min').reset_index()
platform_year_of_release_df.head()
def platform_release(row):
platform = row['platform']
year_df = platform_year_of_release_df[platform_year_of_release_df['platform'] == platform]
year = year_df['year_of_release'].values.tolist()
return year[0]
df_games['year_of_release'] = df_games['year_of_release'].fillna(df_games.apply(platform_release, axis = 1))
df_games.year_of_release = df_games.year_of_release.astype(int)
df_games['critic_score'].fillna(0, inplace=True)
genre_year_of_release_critic_score_df = df_games[df_games['critic_score'] != 0].pivot_table(index = ('genre', 'year_of_release'), values = 'critic_score', aggfunc = 'mean' ).reset_index()
genre_year_of_release_critic_score_df['critic_score_mean'] = genre_year_of_release_critic_score_df['critic_score']
del genre_year_of_release_critic_score_df['critic_score']
genre_year_of_release_critic_score_df.head()
def critic_score_update(row):
critic_score = row['critic_score']
genre = row['genre']
year_of_release = row['year_of_release']
if critic_score == 0:
critic_score_df = genre_year_of_release_critic_score_df[(genre_year_of_release_critic_score_df['genre'] == genre) & (genre_year_of_release_critic_score_df['year_of_release'] == year_of_release)]['critic_score_mean']
if critic_score_df.empty:
return 0
critic_score_mean = critic_score_df.values.tolist()
return critic_score_mean[0]
return critic_score
df_games['critic_score'] = df_games.apply(critic_score_update, axis = 1)
df_games['user_score'].fillna(100, inplace=True)
def user_score_update(row):
user_score = row['user_score']
if user_score == 'tbd':
return 100
return user_score
df_games['user_score'] = df_games.apply(user_score_update, axis = 1)
df_games.user_score = df_games.user_score.astype(float)
genre_year_of_release_user_score_df = df_games[df_games['user_score'] != 100].pivot_table(index = ('genre', 'year_of_release'), values = 'user_score', aggfunc = 'mean' ).reset_index()
genre_year_of_release_user_score_df['user_score_mean'] = genre_year_of_release_user_score_df['user_score']
del genre_year_of_release_user_score_df['user_score']
genre_year_of_release_user_score_df.head()
def user_score_update(row):
user_score = row['user_score']
genre = row['genre']
year_of_release = row['year_of_release']
if user_score == 100:
user_score_df = genre_year_of_release_user_score_df[(genre_year_of_release_user_score_df['genre'] == genre) & (genre_year_of_release_user_score_df['year_of_release'] == year_of_release)]['user_score_mean']
if user_score_df.empty:
return 0
user_score_mean = user_score_df.values.tolist()
return user_score_mean[0]
return user_score
df_games['user_score'] = df_games.apply(user_score_update, axis = 1)
df_games.pivot_table(index = ('year_of_release', 'genre'), columns = 'rating', values = 'name', aggfunc = 'count' ).reset_index().head()
df_games['rating'].fillna('mis', inplace=True)
df_games.describe()
df_games.describe(include = ['object'])
df_games.isnull().sum()
df_games['total_sales'] = df_games[['na_sales', 'eu_sales', 'jp_sales', 'other_sales']].sum(axis = 1)
df_games[['name', 'total_sales']].head()
amount_year_of_release_df = df_games[['year_of_release','name',]].groupby('year_of_release').count().sort_values(by = 'year_of_release').reset_index()
amount_year_of_release_df.head()
fig, ax = plt.subplots(figsize = (17, 10))
ax.vlines(x = amount_year_of_release_df.year_of_release, ymin = 0, ymax = amount_year_of_release_df.name, color = 'purple', alpha = 0.7, linewidth = 2)
ax.scatter(x = amount_year_of_release_df.year_of_release, y = amount_year_of_release_df.name, s = 75, color = 'black', alpha = 0.7)
#set title
ax.set_title('Lollipop Chart for Released Games', fontdict = {'size': 15})
#set lable for Y
ax.set_ylabel('Number of Games')
#set ticks
ax.set_xticks(amount_year_of_release_df.year_of_release)
ax.set_xticklabels(amount_year_of_release_df.year_of_release, rotation = 90, fontdict = {'horizontalalignment' : 'center', 'size': 12})
#add text above the dot
for row in amount_year_of_release_df.itertuples():
ax.text(row.year_of_release, row.name + 30, s = round(row.name, 2))
#find total sales by playform
sales_df = df_games[['platform', 'total_sales']].groupby('platform').sum().sort_values('total_sales').reset_index()
sales_df.head()
sales_df['sales_z'] = (sales_df['total_sales'] - sales_df['total_sales'].mean()) / sales_df['total_sales'].std()
sales_df.head()
#for sales_z above 0, paint green.
#for sales_z below 0, paint red.
sales_df['colors'] = ['red' if x < 0 else 'green' for x in sales_df['sales_z']]
sales_df.head()
sales_df.tail()
plt.figure(figsize = (14, 10))
#create a graph with horizontal lines
plt.hlines(y =sales_df.platform, xmin = 0, xmax = sales_df.sales_z, color = sales_df.colors, alpha = 0.4, linewidth = 10)
df_tree = sales_df[sales_df['total_sales'] >= 1]
sizes = df_tree['total_sales'].values.tolist()
labels = df_tree.apply(lambda x: str(x[0]) + '\n' + '$' + str(round(x[1])), axis = 1)
plt.figure(figsize = (15, 9))
squarify.plot(sizes = sizes, label = labels, alpha = 0.5)
#I will choose platforms that have total sales of over 700 million dollasr
big_salses_list = sales_df[sales_df['total_sales'] > 700]['platform'].values.tolist()
big_salses_df = df_games[df_games['platform'].isin(big_salses_list)][['platform', 'year_of_release', 'total_sales']]
big_salses_df.head()
big_salses_df.shape
sales_by_year_df = big_salses_df.pivot_table(index = ('platform', 'year_of_release'), values = 'total_sales', aggfunc = 'sum').sort_values('year_of_release').reset_index()
sales_by_year_df.head()
sales_by_year_df.shape
year_min = sales_by_year_df.year_of_release.min()
year_max = sales_by_year_df.year_of_release.max()
for year in range(year_min, year_max + 1):
print('Distribution for year:', year)
sizes = sales_by_year_df[sales_by_year_df['year_of_release'] == year]['total_sales'].values.tolist()
labels = sales_by_year_df[sales_by_year_df['year_of_release'] == year].apply(lambda x: str(x[0]) + '\n' + '$' + str(round(x[2])), axis = 1)
plt.figure(figsize = (10, 6))
squarify.plot(sizes = sizes, label = labels, alpha = 0.5)
plt.show()
#will show from 1994
sales_by_year_df = sales_by_year_df[sales_by_year_df['year_of_release'] >= 1994]
fig, ax = plt.subplots(figsize = (15, 10))
colors = ['red', 'gold', 'black', 'violet', 'cyan', 'peru']
i = 0
for platform in big_salses_list:
df = sales_by_year_df[(sales_by_year_df['platform'] == platform) ]
df.plot(kind = 'line', x = 'year_of_release', y = 'total_sales', color = colors[i], ax = ax, label = platform)
i += 1
ax.grid(axis = 'x')
#set ticks
ax.set_xticks(sales_by_year_df.year_of_release)
ax.set_xticklabels(sales_by_year_df.year_of_release, rotation = 90, fontdict = {'horizontalalignment' : 'center', 'size': 12})
plt.show()
list_platform = df_games[df_games['year_of_release'] > 2002]['platform'].unique()
#platform from the last 10 years.
sales_by_year_df_10 = df_games.pivot_table(index = ('platform', 'year_of_release'), values = 'total_sales', aggfunc = 'sum').sort_values('year_of_release').reset_index()
sales_by_year_df_10 = sales_by_year_df_10[sales_by_year_df_10['year_of_release'] >= 2002]
fig, ax = plt.subplots(figsize = (15, 10))
colors = ['red', 'gold', 'black', 'violet', 'cyan', 'peru', 'gray', 'tan', 'darkgreen', 'blue', 'orange', 'fuchsia', 'y', 'aqua', 'brown', 'tomato', 'orchid']
i = 0
for platform in list_platform:
df_10 = sales_by_year_df_10[(sales_by_year_df_10['platform'] == platform)]
df_10.plot(kind = 'line', x = 'year_of_release', y = 'total_sales', color = colors[i], ax = ax, label = platform)
i += 1
ax.grid(axis = 'x')
#set ticks
ax.set_xticks(sales_by_year_df_10.year_of_release)
ax.set_xticklabels(sales_by_year_df_10.year_of_release, rotation = 90, fontdict = {'horizontalalignment' : 'center', 'size': 12})
plt.show()
sales_by_year_df_10.head()
#find total sale by year for each platform
df_games_total_sales = df_games[df_games['year_of_release'] >= 2000 ].pivot_table(index = 'year_of_release', columns = 'platform', values = 'total_sales', aggfunc = 'sum', fill_value = 0)
df_games_total_sales.tail()
df_games_total_sales.shift(+1).tail()
dynamics_df = df_games_total_sales - df_games_total_sales.shift(+1)
dynamics_df.tail()
plt.figure(figsize = (13, 9))
sns.heatmap(dynamics_df.T, cmap = 'RdBu_r') #use RED & BLUE colors
grouped_df = df_games[df_games['year_of_release'] > 2000].groupby(['platform', 'year_of_release'])['total_sales'].sum().reset_index()
grouped_df.head()
ordered_df = grouped_df.groupby(['platform'])['total_sales'].sum().sort_values().reset_index()['platform']
plt.figure(figsize = (13, 10))
sns.boxplot(x = 'platform', y = 'total_sales', data = grouped_df, order = ordered_df)
ps2_df = df_games[df_games['platform'] == 'PS2'][['name', 'year_of_release', 'genre', 'total_sales', 'critic_score', 'user_score']]
ps2_df.head()
#total sales by year
ps2_df.pivot_table(index = 'year_of_release', values = 'total_sales', aggfunc = 'sum')
#mean critic_score by year
ps2_df.pivot_table(index = 'year_of_release', values = 'critic_score', aggfunc = 'mean')
ps2_values_df = ps2_df.pivot_table(index = 'year_of_release', values = ('total_sales', 'critic_score', 'user_score'), aggfunc = ('sum', 'mean')).reset_index()
del ps2_values_df[('critic_score', 'sum')]
del ps2_values_df[('total_sales', 'mean')]
del ps2_values_df[('user_score', 'sum')]
ps2_values_df.columns = ['year_of_release', 'critic_score_mean', 'total_sales_sum', 'user_score_mean']
ps2_values_df
fig, ax = plt.subplots(figsize = (10, 5))
#ax.vlines(x = amount_year_of_release_df.year_of_release, ymin = 0, ymax = amount_year_of_release_df.name, color = 'purple', alpha = 0.7, linewidth = 2)
ax.scatter(x = ps2_values_df.total_sales_sum, y = ps2_values_df.critic_score_mean, s = 75, color = 'black', alpha = 0.7)
#set title
ax.set_title('Total sales by critic score', fontdict = {'size': 15})
#set lable for Y
ax.set_ylabel('critic score mean')
fig, ax = plt.subplots(figsize = (10, 5))
#ax.vlines(x = amount_year_of_release_df.year_of_release, ymin = 0, ymax = amount_year_of_release_df.name, color = 'purple', alpha = 0.7, linewidth = 2)
ax.scatter(x = ps2_values_df.total_sales_sum, y = ps2_values_df.user_score_mean, s = 75, color = 'black', alpha = 0.7)
#set title
ax.set_title('Total sales by user score', fontdict = {'size': 15})
#set lable for Y
ax.set_ylabel('user score mean')
print(ps2_values_df.total_sales_sum.corr(ps2_values_df.critic_score_mean) )
print(ps2_values_df.total_sales_sum.corr(ps2_values_df.user_score_mean) )
ps2_name_list = ps2_df.loc[:, 'name'].unique()
df_games[(df_games['name'].isin(ps2_name_list)) & (df_games['platform'] != 'PS2')].sort_values(by = 'name').head()
#which plaftomrs:
platforms_list = df_games[(df_games['name'].isin(ps2_name_list)) & (df_games['platform'] != 'PS2')].sort_values(by = 'name')['platform'].unique()
df_games_other_platforms = df_games[(df_games['name'].isin(ps2_name_list)) & (df_games['platform'] != 'PS2')].pivot_table(index = ('platform', 'year_of_release'), values = ('critic_score', 'user_score', 'total_sales'), aggfunc = ('mean', 'sum')).reset_index()
del df_games_other_platforms[('critic_score', 'sum')]
del df_games_other_platforms[('total_sales', 'mean')]
del df_games_other_platforms[('user_score', 'sum')]
df_games_other_platforms.columns = ['platform', 'year_of_release', 'critic_score_mean', 'total_sales_sum', 'user_score_mean']
df_games_other_platforms.head()
print('Correlation between Total sales and Critic score:')
for platform in platforms_list:
print()
print('************')
print('for platform:', platform)
df = df_games_other_platforms[df_games_other_platforms['platform'] == platform]
print(df.total_sales_sum.corr(df.critic_score_mean))
print('Correlation between Total sales and Used score:')
for platform in platforms_list:
print()
print('************')
print('for platform:', platform)
df = df_games_other_platforms[df_games_other_platforms['platform'] == platform]
print(df.total_sales_sum.corr(df.user_score_mean))
df_games[df_games['year_of_release'] >= 2000].pivot_table(index = ('genre'), values = 'total_sales', aggfunc = 'sum' ).sort_values(by = 'total_sales')
#find total sales by playform
sales_na_df = df_games[['platform', 'na_sales']].groupby('platform').sum().sort_values('na_sales').reset_index()
#sales_na_df
sales_na_df['sales_z'] = (sales_na_df['na_sales'] - sales_na_df['na_sales'].mean()) / sales_na_df['na_sales'].std()
sales_na_df['colors'] = ['red' if x < 0 else 'green' for x in sales_na_df['sales_z']]
plt.figure(figsize = (14, 10))
#create a graph with horizontal lines
plt.hlines(y =sales_na_df.platform, xmin = 0, xmax = sales_na_df.sales_z, color = sales_na_df.colors, alpha = 0.4, linewidth = 10)
sales_na_five_df = sales_na_df.tail().reset_index()
total_na = sales_na_five_df['na_sales'].sum()
fig, ax = plt.subplots(figsize = (8, 4))
ax.vlines(x = sales_na_five_df.platform, ymin = 0, ymax = sales_na_five_df.na_sales, color = 'purple', alpha = 0.7, linewidth = 2)
ax.scatter(x = sales_na_five_df.platform, y = sales_na_five_df.na_sales, s = 75, color = 'black', alpha = 0.7)
#set title
ax.set_title('Lollipop Chart for sales in North America with share in market', fontdict = {'size': 15})
#set lable for Y
ax.set_ylabel('sales')
#add text above the dot
for row in sales_na_five_df.itertuples():
ax.text(row.platform, row.na_sales + 30, s = str(round(row.na_sales, 2)) + ' : '+ str( round((row.na_sales / total_na) * 100, 2)) + '%')
#find total sales by playform
sales_eu_df = df_games[['platform', 'eu_sales']].groupby('platform').sum().sort_values('eu_sales').reset_index()
sales_eu_df['sales_z'] = (sales_eu_df['eu_sales'] - sales_eu_df['eu_sales'].mean()) / sales_eu_df['eu_sales'].std()
sales_eu_df['colors'] = ['red' if x < 0 else 'green' for x in sales_eu_df['sales_z']]
plt.figure(figsize = (14, 10))
#create a graph with horizontal lines
plt.hlines(y =sales_eu_df.platform, xmin = 0, xmax = sales_eu_df.sales_z, color = sales_eu_df.colors, alpha = 0.4, linewidth = 10)
sales_eu_five_df = sales_eu_df.tail().reset_index()
total_eu = sales_eu_five_df['eu_sales'].sum()
fig, ax = plt.subplots(figsize = (10, 5))
ax.vlines(x = sales_eu_five_df.platform, ymin = 0, ymax = sales_eu_five_df.eu_sales, color = 'purple', alpha = 0.7, linewidth = 2)
ax.scatter(x = sales_eu_five_df.platform, y = sales_eu_five_df.eu_sales, s = 75, color = 'black', alpha = 0.7)
#set title
ax.set_title('Lollipop Chart for sales in Europe with share in market', fontdict = {'size': 15})
#set lable for Y
ax.set_ylabel('sales')
#add text above the dot
for row in sales_eu_five_df.itertuples():
ax.text(row.platform, row.eu_sales + 30, s = str(round(row.eu_sales, 2)) + ' : '+ str( round((row.eu_sales / total_eu) * 100, 2)) + '%')
#find total sales by playform
sales_jp_df = df_games[['platform', 'jp_sales']].groupby('platform').sum().sort_values('jp_sales').reset_index()
sales_jp_df['sales_z'] = (sales_jp_df['jp_sales'] - sales_jp_df['jp_sales'].mean()) / sales_jp_df['jp_sales'].std()
sales_jp_df['colors'] = ['red' if x < 0 else 'green' for x in sales_jp_df['sales_z']]
plt.figure(figsize = (14, 10))
#create a graph with horizontal lines
plt.hlines(y =sales_jp_df.platform, xmin = 0, xmax = sales_jp_df.sales_z, color = sales_jp_df.colors, alpha = 0.4, linewidth = 10)
sales_jp_five_df = sales_jp_df.tail().reset_index()
total_jp = sales_jp_five_df['jp_sales'].sum()
fig, ax = plt.subplots(figsize = (10, 5))
ax.vlines(x = sales_jp_five_df.platform, ymin = 0, ymax = sales_jp_five_df.jp_sales, color = 'purple', alpha = 0.7, linewidth = 2)
ax.scatter(x = sales_jp_five_df.platform, y = sales_jp_five_df.jp_sales, s = 75, color = 'black', alpha = 0.7)
#set title
ax.set_title('Lollipop Chart for sales in Japan with share in market', fontdict = {'size': 15})
#set lable for Y
ax.set_ylabel('sales')
#add text above the dot
for row in sales_jp_five_df.itertuples():
ax.text(row.platform, row.jp_sales + 30, s = str(round(row.jp_sales, 2)) + ' : '+ str( round((row.jp_sales / total_jp) * 100, 2)) + '%')
genre_na_df = df_games[['genre', 'na_sales']].groupby('genre').sum().sort_values('na_sales').reset_index()
genre_na_df.tail()
genre_eu_df = df_games[['genre', 'eu_sales']].groupby('genre').sum().sort_values('eu_sales').reset_index()
genre_eu_df.tail()
genre_jp_df = df_games[['genre', 'jp_sales']].groupby('genre').sum().sort_values('jp_sales').reset_index()
genre_jp_df.tail()
#for North America
rating_na_df = df_games[['rating', 'na_sales']].groupby('rating').sum().sort_values('rating').reset_index()
rating_na_df.drop(rating_na_df[rating_na_df.rating == 'mis'].index, inplace=True)
#for Europe
rating_eu_df = df_games[['rating', 'eu_sales']].groupby('rating').sum().sort_values('rating').reset_index()
rating_eu_df.drop(rating_eu_df[rating_eu_df.rating == 'mis'].index, inplace=True)
#for Japan
rating_jp_df = df_games[['rating', 'jp_sales']].groupby('rating').sum().sort_values('rating').reset_index()
rating_jp_df.drop(rating_jp_df[rating_jp_df.rating == 'mis'].index, inplace=True)
branches = rating_jp_df['rating'].values.tolist()
na_genre_list = rating_na_df['na_sales'].values.tolist()
eu_genre_list = rating_eu_df['eu_sales'].values.tolist()
jp_genre_list = rating_jp_df['jp_sales'].values.tolist()
trace_na = go.Bar(
x = branches,
y = na_genre_list,
name = 'NA'
)
trace_eu = go.Bar(
x = branches,
y = eu_genre_list,
name = 'EU'
)
trace_jp = go.Bar(
x = branches,
y = jp_genre_list,
name = 'JP'
)
data = [trace_na, trace_eu, trace_jp]
layout = go.Layout(barmode = 'group')
fig = go.Figure(data = data, layout = layout)
iplot(fig)
xone_user_rating = df_games[df_games['platform'] == 'XOne']['user_score']
pc_user_rating = df_games[df_games['platform'] == 'PC']['user_score']
#the significance level is 5%
alpha = 0.05
results = st.ttest_ind(xone_user_rating, pc_user_rating)
print('P-value:', results.pvalue)
if(results.pvalue < alpha):
print('We reject the null hypothesis')
else:
print('We can not reject the null hypothesis')
Action_user_rating = df_games[df_games['genre'] == 'Action']['user_score']
sport_user_rating = df_games[df_games['genre'] == 'Sports']['user_score']
#the significance level is 5%
alpha = 0.05
results = st.ttest_ind(Action_user_rating, sport_user_rating)
print('P-value:', results.pvalue)
if(results.pvalue < alpha):
print('We reject the null hypothesis')
else:
print('We can not reject the null hypothesis')